Setup
Load Packages & Functions
# Load functions and packages that are necessary
source("functions.R")
library(plotly)
# We'd like to plot with pretty colors based on national park posters :)
# install.packages("devtools")
#devtools::install_github("katiejolly/nationalparkcolors")
library(nationalparkcolors)
# Assign the 4 colors for the 4 different depths
colors4 <- park_palette("Saguaro", 4)
Load Data
# Import the tax data
tax_physeq <- import_gtdbtk_taxonomy_and_checkm(
taxonomy_filename = "Tara_Oceans_Med/TOBG-MED-READCOUNTMATCH.bac120.tsv",
checkm_filename = "Tara_Oceans_Med/TOBG-MED_qa.txt")
# Import the metadata
meta_physeq <- import_metadata(province_filename = "Tara_Oceans_Med/Sample-Province.tsv",
sizeFraction_filename = "Tara_Oceans_Med/Sample-SizeFraction.tsv") %>%
mutate(names = rownames(.)) %>%
separate(., col = names, into = c("station", "fraction", "depth"), sep = "_") %>%
mutate(r_names = paste(station, fraction, depth, sep = "_")) %>%
column_to_rownames(var = "r_names") %>%
dplyr::select(-c(station, fraction)) %>%
sample_data()
# Import the readcounts
mag_table <- import_readcounts(readcounts_filename = "Tara_Oceans_Med/TOBG-MED-TOTAL.readcounts")
# Import the tree
mag_tree <- import_tree(tree_filename = "Tara_Oceans_Med/GToTree_output.newick")
# Put into phyloseq object
tara_physeq <- phyloseq(meta_physeq, tax_physeq, mag_table, mag_tree)
Normalization
# Normalizing the matrix
# Dividing rows by the genome size for the bin
# Genome size is different from Bin size
# Genome size is BinSize * BinCompletion
# Read in checkm data to calculate the expected
checkm <- read.csv("Tara_Oceans_Med/TOBG-MED_qa.txt", sep = "\t", as.is = TRUE) %>%
dplyr::select("Bin.Id", "Completeness", "Genome.size..bp.") %>%
dplyr::rename(est_genome_size = "Genome.size..bp.") %>%
# Make a new column for the expected genome size based on est_genome_size * completeness
mutate(exp_genome_size = round(est_genome_size/(Completeness/100)))
ordered_checkm <- data.frame(Bin.Id = rownames(mag_table)) %>%
left_join(checkm, by = "Bin.Id")
#Sanity check
stopifnot(rownames(mag_table) == ordered_checkm$Bin.Id)
# Do the normalization
t_mat <- t(as.matrix(mag_table))
# divide the matrix columns by the genome size of each MAG
norm_mat <- t_mat/ordered_checkm$exp_genome_size
t_norm_mat <- t(norm_mat)
# Combine into a normalized phyloseq object
tara_norm_physeq <- phyloseq(meta_physeq, tax_physeq, mag_tree,
otu_table(t_norm_mat, taxa_are_rows = TRUE))
Heatmap
# A clustered heatmap
heatmap(otu_table(tara_norm_physeq))

# Visualize only the top 50 taxa
top_50MAGs <- names(sort(taxa_sums(tara_norm_physeq), decreasing = TRUE))[1:50]
top_50MAGs_physeq <- prune_taxa(top_50MAGs, tara_norm_physeq)
# Subset only the bacterial samples
girus_top50MAGs <- subset_samples(top_50MAGs_physeq, size_fraction =="girus")
# Melt into long format and fix zeros to avoid infinity values
otu_long_50_girus <- psmelt(otu_table(girus_top50MAGs)) %>%
mutate(log_abund = log2(Abundance + 0.0000001))
# Make a "heatmap" with geom_tile that works with plotly :)
heat_plot <- otu_long_50_girus %>%
ggplot(aes(x=Sample, y=OTU)) +
geom_tile(aes(fill = log_abund)) +
scale_fill_distiller(palette = "YlGnBu") +
labs(title = "Top 50 MAGs") +
theme(axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1))
# Plot the plotly plot!
ggplotly(heat_plot)
Beta diversity plots
# Calculate the bray curtis distances for all samples
tara_norm.ord <- ordinate(tara_norm_physeq, method = "PCoA", distance = "bray")
# Make a faceted plot by size fraction
sizeFrac_PCoA_prov <-
plot_ordination(tara_norm_physeq, tara_norm.ord, color = "province") +
facet_wrap(~size_fraction) +
theme_minimal() +
scale_color_brewer(palette = "Paired")
# Make it a plotly plot
ggplotly(sizeFrac_PCoA_prov)
# Subset the bacterial samples and color by depth
bact <- subset_samples(tara_norm_physeq, size_fraction == "bact")
# Calculate the bray curtis distances for a PCoA
bact.ord <- ordinate(bact, method = "PCoA", distance = "bray")
# Make the plot
bact_PCoA_depth <- plot_ordination(bact, bact.ord, color = "depth")+
theme_minimal() +
geom_point(size = 2) +
scale_color_manual(values = colors4)
# Make it a plotly plot
ggplotly(bact_PCoA_depth)
Abundance plots
# Select the top 4 taxa
top_taxa <- names(sort(taxa_sums(bact),decreasing = TRUE))[1:4]
top_bact <- prune_taxa(top_taxa, bact)
# Melt the data to the long format
top_bact_df <- psmelt(top_bact)
# Plot it! :)
abundplot <- ggplot(top_bact_df, aes(x=province, y= Abundance, color = depth)) +
facet_wrap(~OTU, scales = "free_y") +
geom_jitter(size = 2) + theme_minimal()+
labs(y = "Normalized MAG Abundance") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
axis.title.x = element_blank()) +
scale_color_manual(values = colors4)
# Make it interactive
ggplotly(abundplot)